Assignment-1

Author

YangXu

library(tidyverse)
library(data.table)
library(leaflet)
library(ggplot2)

Step 1

Read in the data and conduct EDA checklist items 2-4

# Read in the data using data.table()
pm2.5_2002 <- data.table::fread("ad_viz_plotval_data_2002.csv")
pm2.5_2022 <- data.table::fread("ad_viz_plotval_data_2022.csv")

# check the dimensions, headers, footers, variable names and variable types
# For data of 2002
# dimensions
dim(pm2.5_2002)
[1] 15976    20
# headers
head(pm2.5_2002)
         Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
1: 01/05/2002    AQS 60010007   1                           25.1 ug/m3 LC
2: 01/06/2002    AQS 60010007   1                           31.6 ug/m3 LC
3: 01/08/2002    AQS 60010007   1                           21.4 ug/m3 LC
4: 01/11/2002    AQS 60010007   1                           25.9 ug/m3 LC
5: 01/14/2002    AQS 60010007   1                           34.5 ug/m3 LC
6: 01/17/2002    AQS 60010007   1                           41.0 ug/m3 LC
   DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1:              78 Livermore               1              100
2:              92 Livermore               1              100
3:              71 Livermore               1              100
4:              80 Livermore               1              100
5:              98 Livermore               1              100
6:             115 Livermore               1              100
   AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
1:              88101 PM2.5 - Local Conditions     41860
2:              88101 PM2.5 - Local Conditions     41860
3:              88101 PM2.5 - Local Conditions     41860
4:              88101 PM2.5 - Local Conditions     41860
5:              88101 PM2.5 - Local Conditions     41860
6:              88101 PM2.5 - Local Conditions     41860
                           CBSA_NAME STATE_CODE      STATE COUNTY_CODE  COUNTY
1: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
2: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
3: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
4: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
5: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
6: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
   SITE_LATITUDE SITE_LONGITUDE
1:      37.68753      -121.7842
2:      37.68753      -121.7842
3:      37.68753      -121.7842
4:      37.68753      -121.7842
5:      37.68753      -121.7842
6:      37.68753      -121.7842
# footers
tail(pm2.5_2002)
         Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
1: 12/10/2002    AQS 61131003   1                             15 ug/m3 LC
2: 12/13/2002    AQS 61131003   1                             15 ug/m3 LC
3: 12/22/2002    AQS 61131003   1                              1 ug/m3 LC
4: 12/25/2002    AQS 61131003   1                             23 ug/m3 LC
5: 12/28/2002    AQS 61131003   1                              5 ug/m3 LC
6: 12/31/2002    AQS 61131003   1                              6 ug/m3 LC
   DAILY_AQI_VALUE            Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1:              57 Woodland-Gibson Road               1              100
2:              57 Woodland-Gibson Road               1              100
3:               4 Woodland-Gibson Road               1              100
4:              74 Woodland-Gibson Road               1              100
5:              21 Woodland-Gibson Road               1              100
6:              25 Woodland-Gibson Road               1              100
   AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
1:              88101 PM2.5 - Local Conditions     40900
2:              88101 PM2.5 - Local Conditions     40900
3:              88101 PM2.5 - Local Conditions     40900
4:              88101 PM2.5 - Local Conditions     40900
5:              88101 PM2.5 - Local Conditions     40900
6:              88101 PM2.5 - Local Conditions     40900
                                 CBSA_NAME STATE_CODE      STATE COUNTY_CODE
1: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
2: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
3: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
4: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
5: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
6: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
   COUNTY SITE_LATITUDE SITE_LONGITUDE
1:   Yolo      38.66121      -121.7327
2:   Yolo      38.66121      -121.7327
3:   Yolo      38.66121      -121.7327
4:   Yolo      38.66121      -121.7327
5:   Yolo      38.66121      -121.7327
6:   Yolo      38.66121      -121.7327
# variable names and types
str(pm2.5_2002)
Classes 'data.table' and 'data.frame':  15976 obs. of  20 variables:
 $ Date                          : chr  "01/05/2002" "01/06/2002" "01/08/2002" "01/11/2002" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Daily Mean PM2.5 Concentration: num  25.1 31.6 21.4 25.9 34.5 41 29.3 15 18.8 37.9 ...
 $ UNITS                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ DAILY_AQI_VALUE               : int  78 92 71 80 98 115 87 57 65 107 ...
 $ Site Name                     : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ DAILY_OBS_COUNT               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ PERCENT_COMPLETE              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS_PARAMETER_CODE            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS_PARAMETER_DESC            : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ CBSA_CODE                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA_NAME                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ STATE_CODE                    : int  6 6 6 6 6 6 6 6 6 6 ...
 $ STATE                         : chr  "California" "California" "California" "California" ...
 $ COUNTY_CODE                   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ COUNTY                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ SITE_LATITUDE                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ SITE_LONGITUDE                : num  -122 -122 -122 -122 -122 ...
 - attr(*, ".internal.selfref")=<externalptr> 
# For data in 2022
# dimensions
dim(pm2.5_2022)
[1] 56140    20
# headers
head(pm2.5_2022)
         Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
1: 01/01/2022    AQS 60010007   3                           12.7 ug/m3 LC
2: 01/02/2022    AQS 60010007   3                           13.9 ug/m3 LC
3: 01/03/2022    AQS 60010007   3                            7.1 ug/m3 LC
4: 01/04/2022    AQS 60010007   3                            3.7 ug/m3 LC
5: 01/05/2022    AQS 60010007   3                            4.2 ug/m3 LC
6: 01/06/2022    AQS 60010007   3                            3.8 ug/m3 LC
   DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1:              52 Livermore               1              100
2:              55 Livermore               1              100
3:              30 Livermore               1              100
4:              15 Livermore               1              100
5:              18 Livermore               1              100
6:              16 Livermore               1              100
   AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
1:              88101 PM2.5 - Local Conditions     41860
2:              88101 PM2.5 - Local Conditions     41860
3:              88101 PM2.5 - Local Conditions     41860
4:              88101 PM2.5 - Local Conditions     41860
5:              88101 PM2.5 - Local Conditions     41860
6:              88101 PM2.5 - Local Conditions     41860
                           CBSA_NAME STATE_CODE      STATE COUNTY_CODE  COUNTY
1: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
2: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
3: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
4: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
5: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
6: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
   SITE_LATITUDE SITE_LONGITUDE
1:      37.68753      -121.7842
2:      37.68753      -121.7842
3:      37.68753      -121.7842
4:      37.68753      -121.7842
5:      37.68753      -121.7842
6:      37.68753      -121.7842
# footers
tail(pm2.5_2022)
         Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
1: 12/01/2022    AQS 61131003   1                            3.4 ug/m3 LC
2: 12/07/2022    AQS 61131003   1                            3.8 ug/m3 LC
3: 12/13/2022    AQS 61131003   1                            6.0 ug/m3 LC
4: 12/19/2022    AQS 61131003   1                           34.8 ug/m3 LC
5: 12/25/2022    AQS 61131003   1                           23.2 ug/m3 LC
6: 12/31/2022    AQS 61131003   1                            1.0 ug/m3 LC
   DAILY_AQI_VALUE            Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1:              14 Woodland-Gibson Road               1              100
2:              16 Woodland-Gibson Road               1              100
3:              25 Woodland-Gibson Road               1              100
4:              99 Woodland-Gibson Road               1              100
5:              74 Woodland-Gibson Road               1              100
6:               4 Woodland-Gibson Road               1              100
   AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
1:              88101 PM2.5 - Local Conditions     40900
2:              88101 PM2.5 - Local Conditions     40900
3:              88101 PM2.5 - Local Conditions     40900
4:              88101 PM2.5 - Local Conditions     40900
5:              88101 PM2.5 - Local Conditions     40900
6:              88101 PM2.5 - Local Conditions     40900
                                 CBSA_NAME STATE_CODE      STATE COUNTY_CODE
1: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
2: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
3: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
4: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
5: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
6: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
   COUNTY SITE_LATITUDE SITE_LONGITUDE
1:   Yolo      38.66121      -121.7327
2:   Yolo      38.66121      -121.7327
3:   Yolo      38.66121      -121.7327
4:   Yolo      38.66121      -121.7327
5:   Yolo      38.66121      -121.7327
6:   Yolo      38.66121      -121.7327
# variable names and types
str(pm2.5_2022)
Classes 'data.table' and 'data.frame':  56140 obs. of  20 variables:
 $ Date                          : chr  "01/01/2022" "01/02/2022" "01/03/2022" "01/04/2022" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  3 3 3 3 3 3 3 3 3 3 ...
 $ Daily Mean PM2.5 Concentration: num  12.7 13.9 7.1 3.7 4.2 3.8 2.3 6.9 13.6 11.2 ...
 $ UNITS                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ DAILY_AQI_VALUE               : int  52 55 30 15 18 16 10 29 54 47 ...
 $ Site Name                     : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ DAILY_OBS_COUNT               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ PERCENT_COMPLETE              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS_PARAMETER_CODE            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS_PARAMETER_DESC            : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ CBSA_CODE                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA_NAME                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ STATE_CODE                    : int  6 6 6 6 6 6 6 6 6 6 ...
 $ STATE                         : chr  "California" "California" "California" "California" ...
 $ COUNTY_CODE                   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ COUNTY                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ SITE_LATITUDE                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ SITE_LONGITUDE                : num  -122 -122 -122 -122 -122 ...
 - attr(*, ".internal.selfref")=<externalptr> 
# check for any data issues
summary(pm2.5_2002$`Daily Mean PM2.5 Concentration`)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    7.00   12.00   16.12   20.50  104.30 
summary(pm2.5_2022$`Daily Mean PM2.5 Concentration`)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  -2.20    4.20    6.90    8.52   10.80  302.50 

It dose not make sense to have a daily mean PM2.5 concentration to be -2.2. So I decide to remove all the parts that PM2.5 < 0.

pm2.5_2022 <- pm2.5_2022[`Daily Mean PM2.5 Concentration` >= 0,]
summary(pm2.5_2022$`Daily Mean PM2.5 Concentration`)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   4.200   7.000   8.554  10.800 302.500 

Summary: The mean of daily mean PM2.5 concentration in 2022 is 8.554 ug/m3 LC, which is lower than 16.12 ug/m3 LC in 2002. This may give us a possible sign that daily PM2.5 concentration have decreased in California over the last 20 years.

Step 2

Combine the two years of data into one data frame. Use the Date variable to create a new column for year, which will serve as an identifier. Change the names of the key variables so that they are easier to refer to in your code.

# Create new column of 'year'
pm2.5_2002 <- pm2.5_2002[, year := 2002]
pm2.5_2022 <- pm2.5_2022[, year := 2022]

# Combine two years of data into one
dat_pm2.5 <- rbind(pm2.5_2002, pm2.5_2022)

# Change the names of key variables
setnames(dat_pm2.5,c("Daily Mean PM2.5 Concentration", "SITE_LATITUDE", "SITE_LONGITUDE", "Site Name"),                            c("pm2.5", "lat", "lon", "site"))

Step 3

Create a basic map in leaflet() that shows the locations of the sites (make sure to use different colors for each year). Summarize the spatial distribution of the monitoring sites.

leaflet(dat_pm2.5) %>% 
  addTiles() %>% 
  addCircles(
    lng = ~lon,
    lat = ~lat,
    color = ~ifelse(year == 2002, "red", "blue"), 
    weight = 2, 
    opacity =1,
    fillOpacity = 1, 
    radius = 100,
    popup = ~site, 
    label = "Map of Sites Measured in 2002(red) and 2022 (blue)")

The map shows that the number of blue spots(2022) is much more than the number of red spots(2002). And also the blue spots are more wide-spread than red spots.

Step 4

Check for any missing or implausible values of PM2.5 in the combined dataset. Explore the proportions of each and provide a summary of any temporal patterns you see in these observations.

# Check for missing value
sum(is.na(dat_pm2.5$pm2.5))
[1] 0
# Check for implausible values
summary(dat_pm2.5$pm2.5)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    4.60    7.70   10.23   12.40  302.50 

Step 5

Explore the main question of interest at three different spatial levels. Create exploratory plots (e.g. boxplots, histograms, line plots) and summary statistics that best suit each level of data. Be sure to write up explanations of what you observe in these data.

State Level

# Boxplot
dat_pm2.5[!is.na(year)] %>%
  ggplot() +
    geom_boxplot(mapping = aes(x = year, y = pm2.5, group = year))

# Barcharts
average_pm <- dat_pm2.5 %>%
  group_by(year) %>%
  summarize(avg_pm = mean(pm2.5, na.rm = TRUE))
average_pm %>%
  ggplot() +
  geom_bar(mapping = aes(x = year, y = stat(average_pm$avg_pm) , group = 1))
Warning: `stat(average_pm$avg_pm)` was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(average_pm$avg_pm)` instead.

# Summary statistics
tapply(dat_pm2.5$pm2.5, dat_pm2.5$year, summary)
$`2002`
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    7.00   12.00   16.12   20.50  104.30 

$`2022`
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   4.200   7.000   8.554  10.800 302.500 
t.test(pm2.5_2002$`Daily Mean PM2.5 Concentration`, pm2.5_2022$`Daily Mean PM2.5 Concentration`, paired = FALSE)

    Welch Two Sample t-test

data:  pm2.5_2002$`Daily Mean PM2.5 Concentration` and pm2.5_2022$`Daily Mean PM2.5 Concentration`
t = 66.136, df = 18805, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 7.337922 7.786156
sample estimates:
mean of x mean of y 
16.115943  8.553904 

From the result, there was a statistically significant decrease in daily mean PM2.5 concentration from 2002 to 2022.

County Level

pm2.5_County = dat_pm2.5 %>% 
  filter(dat_pm2.5$COUNTY %in% 
           intersect(pm2.5_2002$COUNTY, pm2.5_2022$COUNTY))
County = group_by(pm2.5_County, year, COUNTY) %>% 
  summarize(`pm2.5` = mean(`pm2.5`, na.rm = TRUE), .groups = "drop") 

Daily_PM2.5_Concentration <- (County$`pm2.5`)
# Boxplot
County %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = year, y = pm2.5, group = COUNTY))

average_pm_by_county_02 <- dat_pm2.5[dat_pm2.5$year == 2002, ] %>%
  group_by(COUNTY) %>% 
  summarize(
    average_pm_2002 = mean(pm2.5, na.rm = TRUE)
  )
average_pm_by_county_22 <- dat_pm2.5[dat_pm2.5$year == 2022, ] %>%
  group_by(COUNTY) %>% 
  summarize(
    average_pm_2022 = mean(pm2.5, na.rm = TRUE)
  )
t.test(average_pm_by_county_02$average_pm_2002, average_pm_by_county_22$average_pm_2022, paired = FALSE)

    Welch Two Sample t-test

data:  average_pm_by_county_02$average_pm_2002 and average_pm_by_county_22$average_pm_2022
t = 4.841, df = 60.526, p-value = 9.299e-06
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 2.582337 6.218018
sample estimates:
mean of x mean of y 
 12.23913   7.83895 

From the result, there was a statistically significant decrease in daily mean PM2.5 concentration by county from 2002 to 2022.

Site Level

average_pm_by_site_02 <- dat_pm2.5[dat_pm2.5$year == 2002, ] %>%
  group_by(site) %>% 
  summarize(
    average_site_pm_2002 = mean(pm2.5, na.rm = TRUE),
    Year = mean(year),
    Lat = mean(lat), 
    Lon = mean(lon)
  )
average_pm_by_site_22 <- dat_pm2.5[dat_pm2.5$year == 2022, ] %>%
  group_by(site) %>% 
  summarize(
    average_site_pm_2022 = mean(pm2.5, na.rm = TRUE),
    Year = mean(year),
    Lat = mean(lat), 
    Lon = mean(lon)
  )
# leaflet maps
Site_mean <- rbindlist(list(
  average_pm_by_site_02, 
  average_pm_by_site_22),fill = TRUE)

color_palette <- colorNumeric(
  palette = "viridis",  
  domain = Site_mean$average_pm_2002_site
)

temp.pal02 <- colorNumeric(c('pink','deeppink','deeppink4'), domain=average_pm_by_site_02$average_site_pm_2002)

leaflet02 <- leaflet(average_pm_by_site_02) %>% 
  addProviderTiles('CartoDB.Positron') %>% 
  addCircles(
    lat = ~Lat, lng=~Lon,
    label = ~paste0(round(average_pm_by_site_02$average_site_pm_2002,2), ' PM2.5'), color = ~ temp.pal02(average_pm_by_site_02$average_site_pm_2002),
    opacity = 1, fillOpacity = 1, radius = 500
    ) %>%
  addLegend('bottomleft', pal=temp.pal02, values=average_pm_by_site_02$average_site_pm_2002,
          title='Mean Concentrations PM2.5 by site in 2002', opacity=1)

leaflet22 <- leaflet(average_pm_by_site_22) %>% 
  addProviderTiles('CartoDB.Positron') %>% 
  addCircles(
    lat = ~Lat, lng=~Lon,
    label = ~paste0(round(average_pm_by_site_22$average_site_pm_2022,2), ' PM2.5'), color = ~ temp.pal02(average_pm_by_site_02$average_site_pm_2002),
    opacity = 1, fillOpacity = 1, radius = 500
    ) %>%
  addLegend('bottomleft', pal=temp.pal02, values=average_pm_by_site_02$average_site_pm_2002,
          title='Mean Concentrations PM2.5 by site in 2022', opacity=1)

leaflet02
leaflet22
# t-test
t.test(average_pm_by_site_02$average_site_pm_2002, average_pm_by_site_22$average_site_pm_2022, paired = FALSE)

    Welch Two Sample t-test

data:  average_pm_by_site_02$average_site_pm_2002 and average_pm_by_site_22$average_site_pm_2022
t = 7.2463, df = 126.67, p-value = 3.71e-11
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 3.732122 6.536277
sample estimates:
mean of x mean of y 
13.211227  8.077028 

From the result, there was a statistically significant decrease in the daily average PM2.5 concentrations by site from 2002 to 2022.